## [1] "/home/guanshim/Documents/gitlab/Omics_Integration/DataRaw/hiv_infected_un"

0.1 Test outliers

# source functions source to get the
# load_filtered_micro_level function to get clr of RA
source(paste0(dir, "Code/5_29_Generate_filtered_Data_Microbiome.R"))
# clean and transform transcriptome data, subset of genes
source(paste0(dir, "Code/6_5_clean_transcriptome.R"))
# clinical data
source(paste0(dir, "Code/6_5_clean_clinical.R"))
# diagnostic plots and tables
source(paste0(dir, "Code/ref_plots.R"))
# outliers
source(paste0(dir, "Code/outliers.R"))
## [1] "Regularized Log Transformation will be applied!"
## [1] "Check Samples, Match: "
## [1] TRUE
## [1] "No gene list provided, will use the whole Transcriptome"
# wrappers
source(paste0(dir, "Code/wrappers.R"))
# run smccnet
source(paste0(dir, "Code/put_together.R"))

########## Datasets ############# phenotype contains ID ####
clin <- rescaled_cli()
## [1] "Regularized Log Transformation will be applied!"
## [1] "Check Samples, Match: "
## [1] TRUE
## [1] "No gene list provided, will use the whole Transcriptome"
# where na in LPS
n_na <- which(is.na(clin$LPS))
sum(is.na(clin$CD14))
## [1] 0
# LPS and CD14 as dataframe
LPS <- clin %>% select(LPS)
CD14 <- clin %>% select(CD14)

##### Transcriptome ######
rna_isgs <- as.data.frame(read.delim(paste0(dir, "DataRaw/hiv_infected_un/coreISG"))) %>% 
    rescaled_rna(., rlog = T)
## [1] "Regularized Log Transformation will be applied!"
## [1] "Check Samples, Match: "
## [1] TRUE
## [1] "Use a subset of genes"
# the isgs data
isgs_rlog <- rna_isgs[[1]]
# names
colnames(isgs_rlog) <- rna_isgs[[2]]$Symbol

rna_genesbeta <- as.data.frame(read.delim(paste0(dir, "DataRaw/hiv_infected_un/genesbeta"))) %>% 
    rescaled_rna(., rlog = T)
## [1] "Regularized Log Transformation will be applied!"
## [1] "Check Samples, Match: "
## [1] TRUE
## [1] "Use a subset of genes"
# the isgs data
genesbeta_rlog <- rna_genesbeta[[1]]
# names
colnames(genesbeta_rlog) <- rna_genesbeta[[2]]$Symbol


######## microbiome ###### Microbiome
micro_data <- load_filtered_micro_level_samples("genus", prevalence = 40, 
    RA = 2, wd = "Ubuntu")
## Found "Unclassified" category in input data
## Created new "Other" category.
## Converted 35400 counts to "Other" otu category.
## Remaining OTUS: 270  (Including "Other").
## 
## Prevalence cutoff: 40% (i.e., at least 40% of libaries must be represented to keep OTU)
## Found 'Other' category in input data.
## Collapsed 195 OTUs to 'Other' in OTU table.
## Converted 80457 counts to 'Other' in OTU table.
## Remaining OTUs: 75  (Including 'Other').
## 
## Relative abundance cutoff: 2 % (i.e., at least one library must have RA > 2 % to keep OTU).
## Found "Other" category in input data.
## Collapsed 21 OTUs to "Other" otu category.
## Converted 47123 counts to "Other" otu category.
## Remaining OTUS: 54  (Including "Other").
## 
## Contains 27 subjects/libraries from Explicet OTU file.
names(micro_data)
## [1] "filtered_RA"  "filtered_clr" "full_RA"      "library_size"
micro_clr <- micro_data[[2]] %>% as.data.frame()

# rescale to mean 0 and variance 1
mibi <- rescale_microbiome(micro_clr)
######### outliers in clinical parameter ##############
# for lps 
dot_groupby(clin, clin$LPS, clin$Group, 0.8, "HIV Status", "LPS")

dot_groupby(clin, clin$LPS, 1, 0.8, "The whole cohort", "LPS")

grubbs_wrapper(LPS[,1], 10)
## [1] "Outlier"
grubbs.test(LPS[,1], # a numeric vector for data values.
            type = 10, # Integer value indicating test variant.
            # 10 is a test for one outlier (side is detected automatically and can be reversed by opposite parameter).
            # 11 is a test for two outliers on opposite tails, 20 is test for two outliers in one tail.
            opposite = FALSE, # a logical indicating whether you want to check not the value with largest difference from the mean
            two.sided = FALSE)
## 
##  Grubbs test for one outlier
## 
## data:  LPS[, 1]
## G = 2.69400, U = 0.69807, p-value = 0.04735
## alternative hypothesis: highest value 2.69404391214899 is an outlier
dot_groupby(clin, clin$CD14, clin$Group, 0.8, "HIV Status", "CD14")

dot_groupby(clin, clin$CD14, 1, 0.8, "The whole cohort", "CD14")

# one outlier
grubbs.test(CD14[,1], 
          type = 10)
## 
##  Grubbs test for one outlier
## 
## data:  CD14[, 1]
## G = 3.20800, U = 0.58895, p-value = 0.004239
## alternative hypothesis: highest value 3.2080477151983 is an outlier
# two opposite
grubbs.test(CD14[,1], 
          type = 11)
## 
##  Grubbs test for two opposite outliers
## 
## data:  CD14[, 1]
## G = 4.81220, U = 0.50123, p-value = 0.05031
## alternative hypothesis: -1.60419928217619 and 3.2080477151983 are outliers
# two ouliers
grubbs.test(CD14[,1], 
          type = 20)
## 
##  Grubbs test for two outliers
## 
## data:  CD14[, 1]
## U = 0.33938, p-value < 2.2e-16
## alternative hypothesis: highest values 2.37443012510193 , 3.2080477151983 are outliers
### test per group hiv ##
grubbs_wrapper(LPS[,1], 10, "hiv")
## [1] "Nonoutlier"
grubbs_wrapper(LPS[,1], 20, "hiv")
##            1 
## "Nonoutlier"
grubbs_wrapper(CD14[,1], 10, "hiv")
## [1] "Outlier"
grubbs_wrapper(CD14[,1], 10, "control")
## [1] "Nonoutlier"
grubbs_wrapper(CD14[,1], 20, "hiv")
##         1 
## "Outlier"
grubbs_wrapper(CD14[,1], 11, "hiv")
## [1] "Nonoutlier"
####### outlier in mibi ############
sapply(colnames(mibi)[1:10], FUN = function(x){
  dot_groupby(mibi, mibi[,x], clin$Group, 0.8, "HIV Status", 
              paste("Rescaled-clr of", x) )
})

##             Actin.Actin.Bifidobacterium Actin.Actin.Rhodococcus
## data        List,54                     List,54                
## layers      List,2                      List,2                 
## scales      ?                           ?                      
## mapping     List,2                      List,2                 
## theme       List,59                     List,59                
## coordinates ?                           ?                      
## facet       ?                           ?                      
## plot_env    ?                           ?                      
## labels      List,2                      List,2                 
##             Actin.Corio.Collinsella Bacte.Bacte.Alistipes
## data        List,54                 List,54              
## layers      List,2                  List,2               
## scales      ?                       ?                    
## mapping     List,2                  List,2               
## theme       List,59                 List,59              
## coordinates ?                       ?                    
## facet       ?                       ?                    
## plot_env    ?                       ?                    
## labels      List,2                  List,2               
##             Bacte.Bacte.Bacteroides Bacte.Bacte.Barnesiella
## data        List,54                 List,54                
## layers      List,2                  List,2                 
## scales      ?                       ?                      
## mapping     List,2                  List,2                 
## theme       List,59                 List,59                
## coordinates ?                       ?                      
## facet       ?                       ?                      
## plot_env    ?                       ?                      
## labels      List,2                  List,2                 
##             Bacte.Bacte.Parabacteroides Bacte.Bacte.Prevotella
## data        List,54                     List,54               
## layers      List,2                      List,2                
## scales      ?                           ?                     
## mapping     List,2                      List,2                
## theme       List,59                     List,59               
## coordinates ?                           ?                     
## facet       ?                           ?                     
## plot_env    ?                           ?                     
## labels      List,2                      List,2                
##             Bacte.Bacte.Prevotellaceae Bacte.Bacte.S24.7
## data        List,54                    List,54          
## layers      List,2                     List,2           
## scales      ?                          ?                
## mapping     List,2                     List,2           
## theme       List,59                    List,59          
## coordinates ?                          ?                
## facet       ?                          ?                
## plot_env    ?                          ?                
## labels      List,2                     List,2
sapply(colnames(mibi)[1:10], FUN = function(x){
  dot_groupby(mibi, mibi[,x], 1, 0.8, "The whole cohort", 
              paste("Rescaled-clr of", x) )
})

##             Actin.Actin.Bifidobacterium Actin.Actin.Rhodococcus
## data        List,54                     List,54                
## layers      List,2                      List,2                 
## scales      ?                           ?                      
## mapping     List,2                      List,2                 
## theme       List,59                     List,59                
## coordinates ?                           ?                      
## facet       ?                           ?                      
## plot_env    ?                           ?                      
## labels      List,2                      List,2                 
##             Actin.Corio.Collinsella Bacte.Bacte.Alistipes
## data        List,54                 List,54              
## layers      List,2                  List,2               
## scales      ?                       ?                    
## mapping     List,2                  List,2               
## theme       List,59                 List,59              
## coordinates ?                       ?                    
## facet       ?                       ?                    
## plot_env    ?                       ?                    
## labels      List,2                  List,2               
##             Bacte.Bacte.Bacteroides Bacte.Bacte.Barnesiella
## data        List,54                 List,54                
## layers      List,2                  List,2                 
## scales      ?                       ?                      
## mapping     List,2                  List,2                 
## theme       List,59                 List,59                
## coordinates ?                       ?                      
## facet       ?                       ?                      
## plot_env    ?                       ?                      
## labels      List,2                  List,2                 
##             Bacte.Bacte.Parabacteroides Bacte.Bacte.Prevotella
## data        List,54                     List,54               
## layers      List,2                      List,2                
## scales      ?                           ?                     
## mapping     List,2                      List,2                
## theme       List,59                     List,59               
## coordinates ?                           ?                     
## facet       ?                           ?                     
## plot_env    ?                           ?                     
## labels      List,2                      List,2                
##             Bacte.Bacte.Prevotellaceae Bacte.Bacte.S24.7
## data        List,54                    List,54          
## layers      List,2                     List,2           
## scales      ?                          ?                
## mapping     List,2                     List,2           
## theme       List,59                    List,59          
## coordinates ?                          ?                
## facet       ?                          ?                
## plot_env    ?                          ?                
## labels      List,2                     List,2
###### outlier in isgs_rlog #########
sapply(colnames(isgs_rlog )[1:10], FUN = function(x){
  dot_groupby(isgs_rlog , isgs_rlog[,x], clin$Group, 0.8, "HIV Status", 
              paste("Rescaled-rlog of", x) )
})

##             LAP3     CD38     ETV7     NUB1     SLC38A5  TYMP     TMSB10  
## data        List,246 List,246 List,246 List,246 List,246 List,246 List,246
## layers      List,2   List,2   List,2   List,2   List,2   List,2   List,2  
## scales      ?        ?        ?        ?        ?        ?        ?       
## mapping     List,2   List,2   List,2   List,2   List,2   List,2   List,2  
## theme       List,59  List,59  List,59  List,59  List,59  List,59  List,59 
## coordinates ?        ?        ?        ?        ?        ?        ?       
## facet       ?        ?        ?        ?        ?        ?        ?       
## plot_env    ?        ?        ?        ?        ?        ?        ?       
## labels      List,2   List,2   List,2   List,2   List,2   List,2   List,2  
##             EIF2AK2  PARP12   TNK2    
## data        List,246 List,246 List,246
## layers      List,2   List,2   List,2  
## scales      ?        ?        ?       
## mapping     List,2   List,2   List,2  
## theme       List,59  List,59  List,59 
## coordinates ?        ?        ?       
## facet       ?        ?        ?       
## plot_env    ?        ?        ?       
## labels      List,2   List,2   List,2
sapply(colnames(isgs_rlog )[1:10], FUN = function(x){
  dot_groupby(isgs_rlog , isgs_rlog[,x], 1, 0.8, "The whole cohort", 
              paste("Rescaled-rlog of", x) )
})

##             LAP3     CD38     ETV7     NUB1     SLC38A5  TYMP     TMSB10  
## data        List,246 List,246 List,246 List,246 List,246 List,246 List,246
## layers      List,2   List,2   List,2   List,2   List,2   List,2   List,2  
## scales      ?        ?        ?        ?        ?        ?        ?       
## mapping     List,2   List,2   List,2   List,2   List,2   List,2   List,2  
## theme       List,59  List,59  List,59  List,59  List,59  List,59  List,59 
## coordinates ?        ?        ?        ?        ?        ?        ?       
## facet       ?        ?        ?        ?        ?        ?        ?       
## plot_env    ?        ?        ?        ?        ?        ?        ?       
## labels      List,2   List,2   List,2   List,2   List,2   List,2   List,2  
##             EIF2AK2  PARP12   TNK2    
## data        List,246 List,246 List,246
## layers      List,2   List,2   List,2  
## scales      ?        ?        ?       
## mapping     List,2   List,2   List,2  
## theme       List,59  List,59  List,59 
## coordinates ?        ?        ?       
## facet       ?        ?        ?       
## plot_env    ?        ?        ?       
## labels      List,2   List,2   List,2
############### subseting #############
# microbiome whole cohort 
whole_outmibi <- apply(mibi, 2, FUN = function(x){
  ifelse(grubbs_wrapper(x, 10) == "Nonoutlier", TRUE, FALSE)
})
# 
hiv_outmibi <- apply(mibi, 2, FUN = function(x){
  ifelse(grubbs_wrapper(x, 10, "hiv") == "Nonoutlier", TRUE, FALSE)
})
sum(whole_outmibi)
## [1] 38
sum(hiv_outmibi)
## [1] 40
# BOTH outlier
sum((whole_outmibi == FALSE) & (hiv_outmibi == FALSE) )
## [1] 12
# outlier as whole 
sum((whole_outmibi == FALSE) & (hiv_outmibi == TRUE) )
## [1] 4
# BOTH outlier
which((whole_outmibi == TRUE) & (hiv_outmibi == FALSE) )
## Prote.Betaproteobacteria Prote.Epsil.Helicobacter 
##                       44                       47
# test the one different between whole and per group
sapply(colnames(mibi)[44], FUN = function(x){
  dot_groupby(mibi, mibi[,x], clin$Group, 0.8, "HIV Status", 
              paste("Rescaled-clr of", x) )
})

##             Prote.Betaproteobacteria
## data        List,54                 
## layers      List,2                  
## scales      ?                       
## mapping     List,2                  
## theme       List,59                 
## coordinates ?                       
## facet       ?                       
## plot_env    ?                       
## labels      List,2
sapply(colnames(mibi)[44], FUN = function(x){
  dot_groupby(mibi, mibi[,x], 1, 0.8, "The whole cohort", 
              paste("Rescaled-clr of", x) )
})

##             Prote.Betaproteobacteria
## data        List,54                 
## layers      List,2                  
## scales      ?                       
## mapping     List,2                  
## theme       List,59                 
## coordinates ?                       
## facet       ?                       
## plot_env    ?                       
## labels      List,2
################ subsetting only by whole ##############
isgs_outlier <- apply(isgs_rlog, 2, FUN = function(x){
  ifelse(grubbs_wrapper(x, 10) == "Nonoutlier", TRUE, FALSE)
})
dim(isgs_rlog)
## [1]  27 246
dim(isgs_rlog[, isgs_outlier])
## [1]  27 151
mibi_outlier <- apply(mibi, 2, FUN = function(x){
  ifelse(grubbs_wrapper(x, 10) == "Nonoutlier", TRUE, FALSE)
})
dim(mibi)
## [1] 27 54
dim(mibi[, mibi_outlier])
## [1] 27 38
####### unweighted ###### with outliers
run_SmCCNet(isgs_rlog, mibi, LPS, 0.35, 0.1, 0.8, 0.9, NULL, 
    n_na)
## [1] "weights can be NULL or a length 3 vector"

## NULL

## NULL

## NULL
## [[1]]
##  [1]   1   3   6  12  13  19  21  23  27  28  53  62  67  84  86 102 103
## [18] 122 126 127 129 145 150 151 158 168 187 189 191 192 198 203 205 206
## [35] 226 227 233 236 238 254
## 
## [[2]]
##  [1]   2   4   8  14  16  35  42  43  48  54  60  70  71  72  79  80  90
## [18]  91  93  96  99 107 108 111 115 117 118 119 121 123 132 152 154 160
## [35] 165 167 171 173 180 188 195 197 199 221 225 232 245 255
## 
## [[3]]
##  [1]  22  34  76  81  87  97 100 113 114 116 169 176 193 201 211 212 222
## [18] 282
# deleting all outliers
run_SmCCNet(isgs_rlog[, isgs_outlier], mibi[, mibi_outlier], 
    LPS, 0.05, 0.05, 0.8, 0.9, NULL, n_na)
## [1] "weights can be NULL or a length 3 vector"

## NULL

## NULL

## NULL

## NULL

## NULL
## [[1]]
## [1]  14 176
## 
## [[2]]
## [1]  27 174
## 
## [[3]]
## [1]  62 156
## 
## [[4]]
## [1]  77 155
## 
## [[5]]
## [1] 105 181
run_SmCCNet(isgs_rlog[, isgs_outlier], mibi[, mibi_outlier], 
    LPS, 0.6, 0.05, 0.8, 0.9, NULL, n_na)
## [1] "weights can be NULL or a length 3 vector"

## NULL

## NULL

## NULL
## [[1]]
##  [1]   1   2   6   7   8  11  19  25  30  31  32  35  36  37  38  39  49
## [18]  51  56  57  58  60  62  67  68  69  72  73  74  76  78  79  84  85
## [35]  87  89  92  94  97  99 103 105 106 108 112 120 127 128 130 131 132
## [52] 133 136 139 140 149 151 156
## 
## [[2]]
##  [1]   3   4  10  12  13  14  17  18  20  22  29  33  34  40  43  44  52
## [18]  54  55  59  63  64  71  75  77  80  82  83  90  93  95  96  98 100
## [35] 104 107 114 117 118 119 121 122 123 124 134 137 138 141 143 144 146
## [52] 147 150 155
## 
## [[3]]
##  [1]   9  15  23  24  26  27  28  42  47  61  65  66  81  86 102 111 113
## [18] 116 126 135 145 148 181
apply(mibi, 2, FUN = function(x) {
    
})
## NULL

0.2 Questions

  1. How to filter the global level of host-transcriptome?
  2. Outlier test on SmCCNet ready data or midwhere of data prepocessing?
  3. Outliers in clinical phenotype, delete some libraries?